home *** CD-ROM | disk | FTP | other *** search
/ Apple Developer Connection Student Program / ADC Tools Sampler CD Disk 3 1999.iso / Metrowerks CodeWarrior / Java Support / Java_Source / IFC_112 / netscape / application / HTMLParser.java < prev    next >
Encoding:
Text File  |  1999-05-28  |  22.7 KB  |  575 lines  |  [TEXT/CWIE]

  1. // HTMLParser.java
  2. // By Ned Etcode
  3. // Copyright 1995, 1996, 1997 Netscape Communications Corp. All rights reserved.
  4.  
  5. package netscape.application;
  6.  
  7. import netscape.util.*;
  8. import java.io.InputStream;
  9. import java.io.IOException;
  10. import java.io.FilterInputStream;
  11.  
  12.  
  13. /** A generic HTML parser. This class provides the HTML
  14.   * parsing functionality without defining how to store HTML.
  15.   * The user provides some information, telling the parser
  16.   * which class should be used for which marker.
  17.   * The parser creates instances of these classes.
  18.   *  @note 1.0 changes
  19.   *  @private
  20.   */
  21. public class HTMLParser extends FilterInputStream {
  22.  
  23.     /*
  24.      * Special char to unicode
  25.      */
  26.     private static final String specialChars[] = { "lt",  "<",
  27.                                      "gt",   ">",
  28.                                      "amp", "&",
  29.                                      "quot", "\"" ,
  30.                                      "nbsp","\u00a0",
  31.                                      "iexcl","\u00a1",
  32.                                      "cent","\u00a2",
  33.                                      "pound","\u00a3",
  34.                                      "curren","\u00a4",
  35.                                      "yen","\u00a5",
  36.                                      "brvbar","\u00a6",
  37.                                      "sect","\u00a7",
  38.                                      "uml","\u00a8",
  39.                                      "copy","\u00a9",
  40.                                      "ordf","\u00aa",
  41.                                      "laquo","\u00ab",
  42.                                      "not","\u00ac",
  43.                                      "shy","\u00ad",
  44.                                      "reg","\u00ae",
  45.                                      "macr","\u00af",
  46.                                      "deg","\u00b0",
  47.                                      "plusmn","\u00b1",
  48.                                      "sup2","\u00b2",
  49.                                      "sup3","\u00b3",
  50.                                      "acute","\u00b4",
  51.                                      "micro","\u00b5",
  52.                                      "para","\u00b6",
  53.                                      "middot","\u00b7",
  54.                                      "cedil","\u00b8",
  55.                                      "sup1","\u00b9",
  56.                                      "ordm","\u00ba",
  57.                                      "raquo","\u00bb",
  58.                                      "frac14","\u00bc",
  59.                                      "frac12","\u00bd",
  60.                                      "frac34","\u00be",
  61.                                      "iquest","\u00bf",
  62.                                      "Agrave","\u00c0",
  63.                                      "Aacute","\u00c1",
  64.                                      "Acirc","\u00c2",
  65.                                      "Atilde","\u00c3",
  66.                                      "Auml","\u00c4",
  67.                                      "Aring","\u00c5",
  68.                                      "AElig","\u00c6",
  69.                                      "Ccedil","\u00c7",
  70.                                      "Egrave","\u00c8",
  71.                                      "Eacute","\u00c9",
  72.                                      "Ecirc","\u00ca",
  73.                                      "Euml","\u00cb",
  74.                                      "Igrave","\u00cc",
  75.                                      "Iacute","\u00cd",
  76.                                      "Icirc","\u00ce",
  77.                                      "Iuml","\u00cf",
  78.                                      "ETH","\u00d0",
  79.                                      "Ntilde","\u00d1",
  80.                                      "Ograve","\u00d2",
  81.                                      "Oacute","\u00d3",
  82.                                      "Ocirc","\u00d4",
  83.                                      "Otilde","\u00d5",
  84.                                      "Ouml","\u00d6",
  85.                                      "times","\u00d7",
  86.                                      "Oslash","\u00d8",
  87.                                      "Ugrave","\u00d9",
  88.                                      "Uacute","\u00da",
  89.                                      "Ucirc","\u00db",
  90.                                      "Uuml","\u00dc",
  91.                                      "Yacute","\u00dd",
  92.                                      "THORN","\u00de",
  93.                                      "szlig","\u00df",
  94.                                      "agrave","\u00e0",
  95.                                      "aacute","\u00e1",
  96.                                      "acirc","\u00e2",
  97.                                      "atilde","\u00e3",
  98.                                      "auml","\u00e4",
  99.                                      "aring","\u00e5",
  100.                                      "aelig","\u00e6",
  101.                                      "ccedil","\u00e7",
  102.                                      "egrave","\u00e8",
  103.                                      "eacute","\u00e9",
  104.                                      "ecirc","\u00ea",
  105.                                      "euml","\u00eb",
  106.                                      "igrave","\u00ec",
  107.                                      "iacute","\u00ed",
  108.                                      "icirc","\u00ee",
  109.                                      "iuml","\u00ef",
  110.                                      "eth","\u00f0",
  111.                                      "ntilde","\u00f1",
  112.                                      "ograve","\u00f2",
  113.                                      "oacute","\u00f3",
  114.                                      "ocirc","\u00f4",
  115.                                      "otilde","\u00f5",
  116.                                      "ouml","\u00f6",
  117.                                      "divide","\u00f7",
  118.                                      "oslash","\u00f8",
  119.                                      "ugrave","\u00f9",
  120.                                      "uacute","\u00fa",
  121.                                      "ucirc","\u00fb",
  122.                                      "uuml","\u00fc",
  123.                                      "yacute","\u00fd",
  124.                                      "thorn","\u00fe",
  125.                                      "yuml","\u00ff",
  126.                                      "ensp"," ",
  127.                                      "emsp"," ",
  128.                                      "endash","-",
  129.                                      "emdash","-",
  130.                                    /*  "zwnj","\u200c",
  131.                                      "zwj", "\u200d",
  132.                                      "lrm", "\u200e",
  133.                                      "rlm", "\u200f",*/
  134.  
  135.     };
  136.  
  137.  
    /** Tokenizer over the input stream; created in the two-argument constructor. */
    private HTMLTokenGenerator tokenGenerator;
    /** Parsing rules consulted for marker classes, container status and terminators. */
    private HTMLParsingRules rules;
    /* Not referenced anywhere in the visible part of this file. */
    private Class defaultContainerClass = null;
    /* Not referenced anywhere in the visible part of this file. */
    private Class defaultMarkerClass = null;
    /** When true, reportSyntaxError() throws an HTMLParsingException; default is false. */
    private boolean throwsException = false;
    /* Not referenced anywhere in the visible part of this file. */
    private FoundationApplet applet;
  144.  
  145.  
    /** Creates a parser reading from <b>in</b>, using a freshly created
      * HTMLParsingRules instance as its rules. Delegates to the
      * two-argument constructor.
      */
    public HTMLParser(InputStream in) {
        this(in,new HTMLParsingRules());
    }
  150.  
  151.     public HTMLParser(InputStream in,HTMLParsingRules rules) {
  152.         super(in);
  153.         this.rules = rules;
  154.         tokenGenerator = new HTMLTokenGenerator(in);
  155.     }
  156.  
  157.     /** Set whether the parser should raise when some bad HTML is parsed.
  158.      *  if flag is false, bad statement will be just ignored
  159.      *  The default is false.
  160.      */
  161.     public void setThrowsExceptionOnHTMLError(boolean flag) {
  162.         throwsException = flag;
  163.     }
  164.  
  165.     /** Return whether the parser throw an exception when some bad HTML is parsed */
  166.     public boolean throwsExceptionOnHTMLError() {
  167.         return throwsException;
  168.     }
  169.  
  170.  
  171.    /**
  172.      *  Parse the next HTML component
  173.      */
  174.     public HTMLElement nextHTMLElement()
  175.         throws IOException,HTMLParsingException,
  176.         java.lang.InstantiationException,java.lang.IllegalAccessException {
  177.             HTMLElement result;
  178.         while( tokenGenerator.hasMoreTokens()) {
  179.             result =  parseNextHTMLElement(true,true,null);
  180.             if( result != null )
  181.                 return result;
  182.         }
  183.         return null;
  184.     }
  185.  
  186.  
  187.  
  188.     /**
  189.      *  Utility to convert String containing attributes to Hashtable
  190.      *  Keys will be converted to upper case.
  191.      */
  192.     public static Hashtable hashtableForAttributeString(String attributesString)
  193.         throws HTMLParsingException {
  194.         Hashtable result = new Hashtable();
  195.         int i,c;
  196.         String key,value;
  197.         FastStringBuffer fb = new FastStringBuffer();
  198.         int offset;
  199.  
  200.         if( attributesString == null )
  201.             return result;
  202.  
  203.         c = attributesString.length();
  204.         i = 0;
  205.         while( i < c ) {
  206.             while( i < c && isSpace(attributesString.charAt(i)) )
  207.                 i++;
  208.             if( i == c )
  209.                 break;
  210.             fb.truncateToLength( 0 );
  211.             offset = parseKeyOrValue( attributesString, i , fb );
  212.             if( offset == 0 ) {
  213.                 throw new HTMLParsingException("Error while parsing attributes " +
  214.                                               attributesString,0);
  215.             }
  216.  
  217.             key = filterKeyOrValue( fb );
  218.             key = key.toUpperCase();
  219.             i += offset;
  220.  
  221.             if( key.equals(""))
  222.                 continue;
  223.  
  224.             while( i < c && isSpace(attributesString.charAt(i)) )
  225.                 i++;
  226.  
  227.             if( i < c && attributesString.charAt(i) == '=' ) { /* We have a value */
  228.                 i++;
  229.                 fb.truncateToLength( 0 );
  230.                 offset = parseKeyOrValue( attributesString, i, fb );
  231.                 value = filterKeyOrValue( fb );
  232.                 i += offset;
  233.                 result.put(key,value);
  234.             } else { /* Attribute without a value */
  235.                 result.put(key,"");
  236.             }
  237.         }
  238.         return result;
  239.     }
  240.  
  241.    /** Called on syntax error. Throw an exception if HTMLParsingException is
  242.      * enabled. Otherwise does nothing.
  243.      */
  244.     public void reportSyntaxError(String description) throws HTMLParsingException {
  245.         if( throwsException )
  246.             throw new HTMLParsingException( description , tokenGenerator.lineForLastToken());
  247.     }
  248.  
  249.     /** Convenience to avoid breaking constructor */
  250.     public void setClassForMarker(Class aClass,String aMarker) {
  251.         rules.setClassNameForMarker(aClass.getName(),aMarker);
  252.     }
  253.  
  254.     private final char unicodeCharForBytes( String bytes ) {
  255.         int i,c;
  256.         String s = bytes;
  257.         if( s.length() > 0 && s.charAt(0) == '#' ) {
  258.             return (char) Integer.parseInt(s.substring(1,s.length()));
  259.         }
  260.         for(i = 0 , c = specialChars.length ; i < c ; i += 2 ) {
  261.             if( specialChars[i].equals( s ))
  262.                 return specialChars[i+1].charAt(0);
  263.         }
  264.         return 0;
  265.     }
  266.  
  267.     private final int convertSpecialCharacter(String s,int startIndex,FastStringBuffer result ){
  268.         int length = s.length();
  269.         char theChar;
  270.  
  271.         if( (startIndex+1) < length ) {
  272.             int start = startIndex + 1;
  273.             int end = start;
  274.             char ch;
  275.  
  276.             ch = s.charAt(end);
  277.             while( end < length && ch != ';' && ch != ' ' && ch != '\n' && ch != '\t' ) {
  278.                 end++;
  279.                 if( end < length )
  280.                     ch = s.charAt(end);
  281.                 else
  282.                     ch = 0;
  283.             }
  284.  
  285.             if( end > start ) {
  286.                 String subStr;
  287.                 subStr = s.substring(start,start+(end-start));
  288.                 theChar = unicodeCharForBytes( subStr );
  289.                 if( theChar != 0 && theChar != 8 )
  290.                     result.append( theChar );
  291.  
  292.                 if( end < length && s.charAt(end) == ';')
  293.                     return subStr.length() + 2; /* + 1 for the starting & and the ; */
  294.                 else
  295.                     return subStr.length() + 1;
  296.             }
  297.         }
  298.         return 0;
  299.     }
  300.  
  301.     private final String filterHTMLString(String s,boolean filterSpaces,
  302.                                           boolean allowSpaceForFirstChar) {
  303.         FastStringBuffer sb = new FastStringBuffer();
  304.         int i,c,delta;
  305.         char ch;
  306.         boolean previousCharWasSpace = false;
  307.         boolean nonSpaceCharFound = false;
  308.         for(i=0,c=s.length() ; i < c ; i++) {
  309.             ch = s.charAt(i);
  310.  
  311.             if(filterSpaces && (ch == ' ' || ch == '\t' || ch == '\n') ) {
  312.                 if( !nonSpaceCharFound &&
  313.                     ((allowSpaceForFirstChar && (ch == '\t' || ch == '\n')) ||
  314.                      (!allowSpaceForFirstChar && (ch == '\t' || ch == '\n' || ch == ' '))))
  315.                     continue;
  316.                 if( previousCharWasSpace )
  317.                     continue;
  318.                 else {
  319.                     previousCharWasSpace = true;
  320.                     sb.append(' ');
  321.                     continue;
  322.                 }
  323.             } else if( ch == '&' ) {
  324.                 delta = convertSpecialCharacter(s,i,sb );
  325.                 if( delta > 0 )
  326.                     i += (delta - 1); /* -1 since i++ will happen before the next iteration */
  327.                 previousCharWasSpace = false;
  328.                 nonSpaceCharFound = true;
  329.                 continue;
  330.             } else if(ch != '\n' && ch != '\t' &&
  331.                       (ch < ' ' || ch > '~') ) /* Should filter these characters */
  332.                 continue;
  333.             previousCharWasSpace = false;
  334.             nonSpaceCharFound = true;
  335.             sb.append( ch );
  336.         }
  337.         if( sb.length() > 0 )
  338.             return sb.toString();
  339.         else
  340.             return null;
  341.     }
  342.  
  343.     private Class classForMarker(String aMarker) {
  344.         String className = rules.classNameForMarker(aMarker);
  345.         if( className != null ) {
  346.             Class c;
  347.  
  348.             try {
  349.                 Application app = Application.application();
  350.                 if(app != null)
  351.                     c = app.classForName(className);
  352.                 else
  353.                     c = Class.forName(className);
  354.             } catch(ClassNotFoundException e) {
  355.                 System.err.println("" + e);
  356.                 c = null;
  357.             }
  358.             return c;
  359.         }
  360.         return null;
  361.     }
  362.  
    /** Consumes the next token and, when possible, builds the HTMLElement
     *  it stands for. Container markers are parsed recursively: their
     *  children are collected until the container is closed or terminated.
     *
     *  doFilterStrings: passed to filterHTMLString() for string tokens.
     *  allowSpaceAsFirstChar: passed to filterHTMLString() as well.
     *  pMarker: marker of the enclosing container, used only in the error
     *  message for an unexpected closing marker. nextHTMLElement() passes
     *  null.
     *
     *  Returns the new element, or null when the token produced none (no
     *  class registered for it, string emptied by filtering, stray end
     *  marker, unknown token).
     */
    private final HTMLElement parseNextHTMLElement(boolean doFilterStrings,
                                                       boolean allowSpaceAsFirstChar,
                                                       String pMarker)
        throws IOException,HTMLParsingException,
        java.lang.InstantiationException,java.lang.IllegalAccessException {
        int token;
        HTMLElement result = null;
        Class c;
        String marker;
        Hashtable markerRules;

        token = tokenGenerator.nextToken();
        switch( token ) {
        case HTMLTokenGenerator.STRING_TOKEN:
            /* Plain text: only kept when a class is registered for strings */
            if((c = classForMarker(HTMLParsingRules.STRING_MARKER_KEY)) != null ) {
                String s = tokenGenerator.stringForLastToken();
                s = filterHTMLString(s,doFilterStrings,allowSpaceAsFirstChar);
                if( s != null ) { /* Filter might remove string with only spaces */
                    result = (HTMLElement) c.newInstance();
                    result.setMarker(HTMLParsingRules.STRING_MARKER_KEY);
                    result.setString( s );
                    return result;
                }
            }
            break;
        case HTMLTokenGenerator.MARKER_BEGIN_TOKEN:
            marker = tokenGenerator.stringForLastToken();
            markerRules = rules.rulesForMarker(marker);
            /* Markers without a registered class are ignored */
            if( (c = classForMarker(marker)) != null) {
                if( rules.isContainer(markerRules)) {
                    HTMLElement nextChild;
                    Vector beginTerminators = null;
                    Vector endTerminators   = null;
                    Object children[],tmp[];
                    int childrenCount;
                    boolean endMarkerFound = false;
                    boolean notFirstChild = false;
                    result = (HTMLElement) c.newInstance();
                    result.setMarker( marker );
                    result.setAttributes( tokenGenerator.attributesForLastToken());

                    /* Children are gathered in an array that doubles when full */
                    children = new Object[2];
                    childrenCount = 0;
                    if( markerRules != null ) {
                        beginTerminators = (Vector) markerRules.get(
                                                 HTMLParsingRules.BEGIN_TERMINATION_MARKERS_KEY);
                        endTerminators   = (Vector) markerRules.get(
                                                 HTMLParsingRules.END_TERMINATION_MARKERS_KEY);
                    }

                    while( tokenGenerator.hasMoreTokens() ) {
                        /* Look ahead to decide whether the container stops here.
                         * NOTE(review): this relies on stringForLastToken()
                         * describing the peeked token - confirm in
                         * HTMLTokenGenerator.
                         */
                        token = tokenGenerator.peekNextToken();
                        if( token == HTMLTokenGenerator.MARKER_END_TOKEN ) {
                            String endMarker = tokenGenerator.stringForLastToken();
                            if(marker.equals(endMarker)) {
                                tokenGenerator.nextToken(); /* Remove the token */
                                endMarkerFound = true;
                                break;
                            } else if( endTerminators != null &&
                                       endTerminators.indexOf(endMarker)!=-1) {
                                /* Closed by an end terminator; the token is
                                 * not consumed here.
                                 */
                                endMarkerFound=true;
                                break;
                            } else if(classForMarker(endMarker) != null) {
                                /* Unexpected end for a known marker.
                                 * This is an error, but we stop parsing the
                                 * current marker so that the known marker
                                 * can be closed. This strategy avoids very
                                 * deep trees when some closing markers are
                                 * not in the right scope.
                                 */
                                reportSyntaxError("Unexcpected closing " + endMarker +
                                                  " while parsing contents for " + marker );
                                endMarkerFound=true;
                                break;
                            }
                        } else if( token == HTMLTokenGenerator.MARKER_BEGIN_TOKEN &&
                                   beginTerminators != null &&
                                   beginTerminators.indexOf(tokenGenerator.stringForLastToken())
                                   != -1 ) {
                            /* Closed by a begin terminator; the token is not
                             * consumed here.
                             */
                            endMarkerFound = true;
                            break;
                        }
                        /* Should filter strings if the marker requires it or
                         * one of the parent requires it.
                         */
                        if( rules.shouldFilterStringsForChildren(markerRules) &&
                            doFilterStrings==true )
                            nextChild = parseNextHTMLElement(true,notFirstChild,marker);
                        else
                            nextChild = parseNextHTMLElement(false,notFirstChild,marker);
                        notFirstChild = true;
                        if( nextChild == null ) {
                            /* The token gave no element: stop if input is
                             * exhausted, otherwise try the next token.
                             */
                            if( tokenGenerator.hasMoreTokens() == false ) {
                                reportSyntaxError("Unterminated marker " + marker);
                                break;
                            } else
                                continue;
                        } else {
                            children[childrenCount++] = nextChild;
                            if( childrenCount == children.length ) {
                                Object newChildren[] = new Object[children.length * 2];
                                System.arraycopy(children,0,newChildren,0,childrenCount);
                                children = newChildren;
                            }
                        }
                    }

                    /* Hand over an exactly sized array, or null when empty */
                    if( childrenCount > 0 ) {
                        tmp = new Object[childrenCount];
                        System.arraycopy(children,0,tmp,0,childrenCount);
                        result.setChildren( tmp );
                    } else
                        result.setChildren( null );

                    if(! endMarkerFound ) {
                        reportSyntaxError("No end found for marker " + marker);
                    }
                    return result;
                } else {
                    /* Non container marker: no children to collect */
                    result = (HTMLElement) c.newInstance();
                    result.setMarker( marker );
                    result.setAttributes( tokenGenerator.attributesForLastToken());
                    return result;
                }
            }
            break;
        case HTMLTokenGenerator.COMMENT_TOKEN:
            /* Comments are kept unfiltered when a class is registered for them */
            if((c = classForMarker(HTMLParsingRules.COMMENT_MARKER_KEY)) != null ) {
                String s = tokenGenerator.stringForLastToken();
                result = (HTMLElement) c.newInstance();
                result.setMarker(HTMLParsingRules.COMMENT_MARKER_KEY);
                result.setString( s );
                return result;
            }
            break;
        case HTMLTokenGenerator.MARKER_END_TOKEN:
            /* An end marker reaching this point was not matched by a
             * container; it is reported only for known markers whose
             * rules do not ask to ignore their end.
             */
            marker = tokenGenerator.stringForLastToken();
            c = classForMarker(marker);
            if( c != null && !rules.shouldIgnoreEnd( rules.rulesForMarker( marker )))   {
                reportSyntaxError("Unexpected closing " + marker +
                            " while parsing contents for marker " + pMarker);
            }
            break;
        default:
            reportSyntaxError("Unexpected statement");
        }
        return null;
    }
  512.  
  513.  
  514.  
  515.  
  516.    private static boolean isSpace(char c) {
  517.        if( c == ' ' || c == '\t' || c == '\n' )
  518.            return true;
  519.        else
  520.            return false;
  521.    }
  522.  
  523.    private static int parseKeyOrValue(String source,int index,FastStringBuffer dest) {
  524.        int start,end,length;
  525.        start = index;
  526.        length = source.length();
  527.        char endChar = 0;
  528.  
  529.        while(start < length && isSpace(source.charAt(start)))
  530.            start++;
  531.  
  532.        if( start == length )
  533.            return 0;
  534.  
  535.        end = start;
  536.        if( source.charAt(end) == '\'' ||
  537.            source.charAt(end) == '"' )
  538.            endChar = source.charAt(end);
  539.        do {
  540.            dest.append(source.charAt(end));
  541.            end++;
  542.        } while(end < length &&
  543.              ((endChar == 0 && !isSpace(source.charAt(end)) && source.charAt(end) != '=' ) ||
  544.               (endChar != 0 && source.charAt(end) != endChar)));
  545.  
  546.        if( end < length && source.charAt(end) == endChar ) {
  547.            dest.append(source.charAt(end));
  548.            end++;
  549.        }
  550.        return end - start;
  551.    }
  552.  
  553.    /* Remove " or '.
  554.     */
  555.    private static String filterKeyOrValue(FastStringBuffer source) {
  556.        int c = source.length();
  557.  
  558.        if( c == 0 )
  559.            return "";
  560.  
  561.        if( source.charAt(0) == '\''  || source.charAt(0) == '"' ) {
  562.            if( c <= 2 )
  563.                return "";
  564.            else
  565.                return source.toString().substring(1,c-1);
  566.        }
  567.        return source.toString();
  568.    }
  569.  
  570. }
  571.  
  572.  
  573.  
  574.  
  575.